library("tidyverse")
library("tibble")
library("msigdbr")
library("ggplot2")
library("TCGAbiolinks")
library("RNAseqQC")
library("DESeq2")
library("ensembldb")
library("purrr")
library("magrittr")
library("vsn")
library("matrixStats")
library("dplyr")
library("grex")
library("biomaRt")
Download miRNA expression data from The Cancer Genome Atlas (TCGA): -
TCGA-COAD
refers to the biospecimen data for colon
adenocarcinoma.
query_tumor <- GDCquery(
project = "TCGA-COAD",
data.category = "Transcriptome Profiling",
data.type = "miRNA Expression Quantification",
experimental.strategy = "miRNA-Seq",
access = "open",
sample.type = "Primary Tumor"
)
tumor <- getResults(query_tumor)
tumor
query_normal <- GDCquery(
project = "TCGA-COAD",
data.category = "Transcriptome Profiling",
data.type = "miRNA Expression Quantification",
experimental.strategy = "miRNA-Seq",
access = "open",
sample.type = "Solid Tissue Normal"
)
normal <- getResults(query_normal)
normal
Consider only samples with both normal and malignant tissues.
submitter_ids <- inner_join(tumor, normal, by = "cases.submitter_id") %>%
dplyr::select(cases.submitter_id)
tumor <- tumor %>%
dplyr::filter(cases.submitter_id %in% submitter_ids$cases.submitter_id)
normal <- normal %>%
dplyr::filter(cases.submitter_id %in% submitter_ids$cases.submitter_id)
samples <- rbind(tumor, normal)
invisible(unique(samples$sample_type))
samples
Download only samples with both normal and malignant tissues.
To impose this filtering, we set the barcode
argument of
GDCquery
to samples$sample.submitter_id
(which
was generated in the previous cell).
query_coad <- GDCquery(
project = "TCGA-COAD",
data.category = "Transcriptome Profiling",
data.type = "miRNA Expression Quantification",
experimental.strategy = "miRNA-Seq",
access = "open",
sample.type = c("Solid Tissue Normal", "Primary Tumor"),
barcode = as.list(samples$sample.submitter_id)
)
If this is your first time running this notebook (i.e., you have not yet downloaded the results of the query in the previous block), uncomment the code block below.
GDC_DIR = "../data/public/GDCdata"
# GDCdownload(
# query_coad,
# directory = GDC_DIR
# )
Running the code block above should generate and populate a directory
named GDCdata
.
Construct the RNA-seq count matrix.
tcga_coad_data <- GDCprepare(
query_coad,
directory = GDC_DIR,
summarizedExperiment = TRUE
)
rownames(tcga_coad_data) <- tcga_coad_data$miRNA_ID
count_matrix <- tcga_coad_data[, colnames(tcga_coad_data)[grep("count", colnames(tcga_coad_data))]]
colnames(count_matrix) <- gsub("read_count_", "", colnames(count_matrix))
# Remove duplicate entries
count_matrix_df <- data.frame(count_matrix)
count_matrix_df <- count_matrix_df[!duplicated(count_matrix_df), ]
count_matrix <- data.matrix(count_matrix_df)
rownames(count_matrix) <- cleanid(rownames(count_matrix))
count_matrix <- count_matrix[!(duplicated(rownames(count_matrix)) | duplicated(rownames(count_matrix), fromLast = TRUE)), ]
head(count_matrix[1:5, 1:4])
TCGA.A6.2684.01A.01T.1409.13 TCGA.A6.2671.01A.01T.1409.13 TCGA.AA.3520.01A.01T.0822.13 TCGA.A6.2680.01A.01T.1409.13
hsa-let-7a-1 14506 11763 4503 6055
hsa-let-7a-2 14299 11992 4589 5945
hsa-let-7a-3 14183 11822 4619 6018
hsa-let-7b 31896 15937 19505 9392
hsa-let-7c 3130 1488 751 141
Format the samples
table so that it can be fed as input
to DESeq2.
rownames(samples) <- samples$cases
samples <- samples %>%
dplyr::select(case = "cases.submitter_id", type = "sample_type")
samples$type <- str_replace(samples$type, "Solid Tissue Normal", "normal")
samples$type <- str_replace(samples$type, "Primary Tumor", "tumor")
DESeq2 requires the row names of samples
should be
identical to the column names of count_matrix
.
colnames(count_matrix) <- gsub(x = colnames(count_matrix), pattern = "\\.", replacement = "-")
count_matrix <- count_matrix[, rownames(samples)]
# Sanity check
all(colnames(count_matrix) == rownames(samples))
References:
Construct the DESeqDataSet
object.
dds <- DESeqDataSetFromMatrix(
countData = count_matrix,
colData = samples,
design = ~type
)
Warning: some variables in design formula are characters, converting to factors
Display quality control (QC) plots (refer to https://cran.r-project.org/web/packages/RNAseqQC/vignettes/introduction.html)
CAVEAT: There seem to be some outliers — but we defer handling them for now!
plot_total_counts(dds)
plot_library_complexity(dds)
plot_gene_detection(dds)
Perform miRNA filtering.
We determined min_count
empirically by looking at the
red trend line in the variance stabilization plot. Ideally, this trend
line should be flat (i.e., stable).
dds <- filter_genes(dds, min_count = 10)
Transform the read counts.
From https://chipster.csc.fi/manual/deseq2-transform.html:
You can use the resulting transformed values only for visualization
and clustering, not for differential expression analysis which needs raw
counts.
dds <- estimateSizeFactors(dds)
nsub <- sum(rowMeans(counts(dds, normalized = TRUE)) > 10)
vsd <- vst(dds, nsub = nsub)
mean_sd_plot(vsd)
Check the clustering of the samples.
If you encounter the error
Error in loadNamespace(x) : there is no package called 'ComplexHeatmap'
,
uncomment and run the following code block:
# install.packages("devtools", dependencies = TRUE)
# devtools::install_github("jokergoo/ComplexHeatmap")
set.seed(42)
plot_sample_clustering(vsd, anno_vars = c("type"), distance = "euclidean")
Perform principal component analysis (PCA).
plot_pca(vsd, PC_x = 1, PC_y = 2, shape_by = "type")
Refer to
1. Exploratory Data Analysis - MSigDB Gene Sets + GTEx TPM.Rmd
for more detailed documentation on obtaining the gene sets.
We are going to refer to miRDB to for the miRNA-mRNA mapping: https://academic.oup.com/nar/article/48/D1/D127/5557729
The first column of the dataset lists the miRNAs, while the second
column lists the mRNAs (more specifically, the RefSeq IDs of the mRNAs).
However, our gene sets list the mRNAs of interest (i.e., those involved
in regulated cell death) using their gene symbols.
Hence, we need
to perform some preprocessing to convert the gene symbols to RefSeq
IDs.
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
Alternative miRNA-mRNA mapping tools/databases:
Fetch the necroptosis gene set.
necroptosis.genes <- msigdbr(species = "human", category = "C5", subcategory = "GO:BP") %>%
dplyr::filter(gs_name == "GOBP_NECROPTOTIC_SIGNALING_PATHWAY")
necroptosis.genes
Get the gene symbols of the genes included in the necroptosis gene set, and convert them to RefSeq IDs.
rcd_gene_symbols <- necroptosis.genes$gene_symbol
rcd_refseq <- getBM(attributes = c("refseq_mrna", "hgnc_symbol"), filters = "hgnc_symbol", values = rcd_gene_symbols, mart = mart)
rcd_refseq
Write the RefSeq IDs of the mRNAs of interest to a file.
rcd_mrna_file <- "../data/public/rcd-gene-list/refseq/necroptosis-genes-refseq.txt"
rcd_refseq.unique_ids <- unique(unlist(rcd_refseq$refseq_mrna))
rcd_refseq.unique_ids <- rcd_refseq.unique_ids[!sapply(rcd_refseq.unique_ids, identical, "")]
# Regenerate the file every time
if (file.exists(rcd_mrna_file)) {
file.remove(rcd_mrna_file)
}
invisible(lapply(rcd_refseq.unique_ids, write, rcd_mrna_file, append = TRUE, ncolumns = 1))
Fetch the miRNAs targeting the mRNAs of interest.
necroptosis.mirna <- read.table("../data/public/miRNA/necroptosis-mirna.txt")
necroptosis.mirna <- unique(unlist(necroptosis.mirna$V1))
Filter the genes to include only those in the necroptosis gene set.
coad_necroptosis <- count_matrix[rownames(count_matrix) %in% necroptosis.mirna, ]
coad_necroptosis <- coad_necroptosis[, rownames(samples)]
# Check if all samples in the counts dataframe are in the samples dataframe
all(colnames(coad_necroptosis) == rownames(samples))
Perform differential miRNA expression analysis.
dds <- DESeqDataSetFromMatrix(
countData = coad_necroptosis,
colData = samples,
design = ~type
)
Warning: some variables in design formula are characters, converting to factors
dds <- filter_genes(dds, min_count = 10)
dds$type <- relevel(dds$type, ref = "normal")
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 5 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
res <- results(dds)
summary(res)
out of 82 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 40, 49%
LFC < 0 (down) : 36, 44%
outliers [1] : 0, 0%
low counts [2] : 0, 0%
(mean count < 5)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Prettify the display of results.
deseq.results <- res
deseq.bbl.data <- data.frame(
row.names = rownames(deseq.results),
baseMean = deseq.results$baseMean,
log2FoldChange = deseq.results$log2FoldChange,
lfcSE = deseq.results$lfcSE,
stat = deseq.results$stat,
pvalue = deseq.results$pvalue,
padj = deseq.results$padj,
cancer_type = "Colon"
)
deseq.bbl.data
Filter based on p-value and log fold change cutoffs.
deseq.bbl.data.filtered <- dplyr::filter(deseq.bbl.data, abs(log2FoldChange) >= 1.5 & padj < 0.05)
deseq.bbl.data.filtered
Plot the results.
ggplot(deseq.bbl.data.filtered, aes(x = cancer_type, y = rownames(deseq.bbl.data.filtered), size = padj, fill = log2FoldChange)) +
geom_point(alpha = 0.5, shape = 21, color = "black") +
scale_size(trans = "reverse") +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(min(deseq.bbl.data.filtered$log2FoldChange), max(deseq.bbl.data.filtered$log2FoldChange))) +
theme_minimal() +
theme(legend.position = "bottom") +
theme(legend.position = "bottom") +
labs(size = "Adjusted p-value", fill = "log2 FC", x = "Cancer type", y = "miRNA")
Fetch the ferroptosis gene set.
ferroptosis.genes <- msigdbr(species = "human", category = "C2", subcategory = "CP:WIKIPATHWAYS") %>%
dplyr::filter(gs_name == "WP_FERROPTOSIS")
ferroptosis.genes
Get the gene symbols of the genes included in the ferroptosis gene set, and convert them to RefSeq IDs.
rcd_gene_symbols <- ferroptosis.genes$gene_symbol
rcd_refseq <- getBM(attributes = c("refseq_mrna", "hgnc_symbol"), filters = "hgnc_symbol", values = rcd_gene_symbols, mart = mart)
rcd_refseq
Write the RefSeq IDs of the mRNAs of interest to a file.
rcd_mrna_file <- "../data/public/rcd-gene-list/refseq/ferroptosis-genes-refseq.txt"
rcd_refseq.unique_ids <- unique(unlist(rcd_refseq$refseq_mrna))
rcd_refseq.unique_ids <- rcd_refseq.unique_ids[!sapply(rcd_refseq.unique_ids, identical, "")]
# Regenerate the file every time
if (file.exists(rcd_mrna_file)) {
file.remove(rcd_mrna_file)
}
invisible(lapply(rcd_refseq.unique_ids, write, rcd_mrna_file, append = TRUE, ncolumns = 1))
Fetch the miRNAs targeting the mRNAs of interest.
ferroptosis.mirna <- read.table("../data/public/miRNA/ferroptosis-mirna.txt")
ferroptosis.mirna <- unique(unlist(ferroptosis.mirna$V1))
Filter the genes to include only those in the ferroptosis gene set.
coad_ferroptosis <- count_matrix[rownames(count_matrix) %in% ferroptosis.mirna, ]
coad_ferroptosis <- coad_ferroptosis[, rownames(samples)]
# Check if all samples in the counts dataframe are in the samples dataframe
all(colnames(coad_ferroptosis) == rownames(samples))
Perform differential miRNA expression analysis.
dds <- DESeqDataSetFromMatrix(
countData = coad_ferroptosis,
colData = samples,
design = ~type
)
Warning: some variables in design formula are characters, converting to factors
dds <- filter_genes(dds, min_count = 10)
dds$type <- relevel(dds$type, ref = "normal")
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 13 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
res <- results(dds)
summary(res)
out of 256 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 114, 45%
LFC < 0 (down) : 100, 39%
outliers [1] : 0, 0%
low counts [2] : 0, 0%
(mean count < 4)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Prettify the display of results.
deseq.results <- res
deseq.bbl.data <- data.frame(
row.names = rownames(deseq.results),
baseMean = deseq.results$baseMean,
log2FoldChange = deseq.results$log2FoldChange,
lfcSE = deseq.results$lfcSE,
stat = deseq.results$stat,
pvalue = deseq.results$pvalue,
padj = deseq.results$padj,
cancer_type = "Colon"
)
deseq.bbl.data
Filter based on p-value and log fold change cutoffs.
deseq.bbl.data.filtered <- dplyr::filter(deseq.bbl.data, abs(log2FoldChange) >= 1.5 & padj < 0.05)
deseq.bbl.data.filtered
Plot the results.
ggplot(deseq.bbl.data.filtered, aes(x = cancer_type, y = rownames(deseq.bbl.data.filtered), size = padj, fill = log2FoldChange)) +
geom_point(alpha = 0.5, shape = 21, color = "black") +
scale_size(trans = "reverse") +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(min(deseq.bbl.data.filtered$log2FoldChange), max(deseq.bbl.data.filtered$log2FoldChange))) +
theme_minimal() +
theme(legend.position = "bottom") +
theme(legend.position = "bottom") +
labs(size = "Adjusted p-value", fill = "log2 FC", x = "Cancer type", y = "miRNA")
Fetch the pyroptosis gene set.
pyroptosis.genes <- msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME") %>%
dplyr::filter(gs_name == "REACTOME_PYROPTOSIS")
pyroptosis.genes
Get the gene symbols of the genes included in the pyroptosis gene set, and convert them to RefSeq IDs.
rcd_gene_symbols <- pyroptosis.genes$gene_symbol
rcd_refseq <- getBM(attributes = c("refseq_mrna", "hgnc_symbol"), filters = "hgnc_symbol", values = rcd_gene_symbols, mart = mart)
rcd_refseq
Write the RefSeq IDs of the mRNAs of interest to a file.
rcd_mrna_file <- "../data/public/rcd-gene-list/refseq/pyroptosis-genes-refseq.txt"
rcd_refseq.unique_ids <- unique(unlist(rcd_refseq$refseq_mrna))
rcd_refseq.unique_ids <- rcd_refseq.unique_ids[!sapply(rcd_refseq.unique_ids, identical, "")]
# Regenerate the file every time
if (file.exists(rcd_mrna_file)) {
file.remove(rcd_mrna_file)
}
invisible(lapply(rcd_refseq.unique_ids, write, rcd_mrna_file, append = TRUE, ncolumns = 1))
Fetch the miRNAs targeting the mRNAs of interest.
pyroptosis.mirna <- read.table("../data/public/miRNA/pyroptosis-mirna.txt")
pyroptosis.mirna <- unique(unlist(pyroptosis.mirna$V1))
Filter the genes to include only those in the pyroptosis gene set.
coad_pyroptosis <- count_matrix[rownames(count_matrix) %in% pyroptosis.mirna, ]
coad_pyroptosis <- coad_pyroptosis[, rownames(samples)]
# Check if all samples in the counts dataframe are in the samples dataframe
all(colnames(coad_pyroptosis) == rownames(samples))
Perform differential miRNA expression analysis.
dds <- DESeqDataSetFromMatrix(
countData = coad_pyroptosis,
colData = samples,
design = ~type
)
Warning: some variables in design formula are characters, converting to factors
dds <- filter_genes(dds, min_count = 10)
dds$type <- relevel(dds$type, ref = "normal")
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 9 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
res <- results(dds)
summary(res)
out of 184 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 84, 46%
LFC < 0 (down) : 71, 39%
outliers [1] : 0, 0%
low counts [2] : 0, 0%
(mean count < 4)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Prettify the display of results.
deseq.results <- res
deseq.bbl.data <- data.frame(
row.names = rownames(deseq.results),
baseMean = deseq.results$baseMean,
log2FoldChange = deseq.results$log2FoldChange,
lfcSE = deseq.results$lfcSE,
stat = deseq.results$stat,
pvalue = deseq.results$pvalue,
padj = deseq.results$padj,
cancer_type = "Colon"
)
deseq.bbl.data
Filter based on p-value and log fold change cutoffs.
deseq.bbl.data.filtered <- dplyr::filter(deseq.bbl.data, abs(log2FoldChange) >= 1.5 & padj < 0.05)
deseq.bbl.data.filtered
Plot the results.
ggplot(deseq.bbl.data.filtered, aes(x = cancer_type, y = rownames(deseq.bbl.data.filtered), size = padj, fill = log2FoldChange)) +
geom_point(alpha = 0.5, shape = 21, color = "black") +
scale_size(trans = "reverse") +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(min(deseq.bbl.data.filtered$log2FoldChange), max(deseq.bbl.data.filtered$log2FoldChange))) +
theme_minimal() +
theme(legend.position = "bottom") +
theme(legend.position = "bottom") +
labs(size = "Adjusted p-value", fill = "log2 FC", x = "Cancer type", y = "miRNA")
De La Salle University, Manila, Philippines, gonzales.markedward@gmail.com↩︎
De La Salle University, Manila, Philippines, anish.shrestha@dlsu.edu.ph↩︎